########################################################################
# DATA LOAD AND CLEAN
########################################################################
#Read the raw data, drop incomplete rows, and remove constant columns
#(a column with a single distinct value carries no information).
#select(where()) / mutate(across()) replace the superseded
#select_if() / mutate_if() variants.
final_data <- read_excel(here("data", "thesis_data_final.xlsx"))
final_data <- final_data %>%
  drop_na() %>%
  select(where(~ n_distinct(.x) > 1)) %>%
  mutate(across(where(is.character), as.factor))
#Keep a copy with Age and BMI standardized (mean 0, sd 1).
#as.numeric() strips the 1-column matrix that scale() returns, so the
#columns stay plain numeric vectors instead of matrix columns.
final_data_scaled <- final_data %>%
  mutate(across(c(Age, BMI), ~ as.numeric(scale(.x))))
########################################################################
# MODEL SELECTION FOR STEPWISE TEST
########################################################################
#step.model <-glm(formula = Gender~ ., family = "binomial",
# data = data_train)
#summary(step.model)
#step <- stepAIC(step.model, direction="both")
#step
#The stepwise model was determined to be:
#glm(formula = Gender ~ CM_IHD + Lung_Disease + Symptom_Fever +
#    Symptom_Nausea_Vomiting, family = "binomial", data = data_train)
########################################################################
# SPLIT DATA INTO TRAINING AND TEST SETS
########################################################################
#Fix the RNG state so the exact same split can be reproduced later
set.seed(568)
#70/30 split, stratified on the outcome so both sets keep roughly the
#same Gender distribution
splits <- initial_split(final_data, prop = 0.7, strata = Gender)
#Materialize the two partitions from the split rule
data_train <- training(splits)
data_test <- testing(splits)
#Convenience views: the test set without the outcome, and separate
#outcome (y) / predictor (x) frames for both partitions.
#dplyr::select is namespaced because MASS (loaded below) masks select().
test_no_outcome <- dplyr::select(data_test, -Gender)
test_y <- dplyr::select(data_test, Gender)
test_x <- dplyr::select(data_test, -Gender)
train_y <- dplyr::select(data_train, Gender)
train_x <- dplyr::select(data_train, -Gender)
#Persist the test set and its predictor-only view for later sessions
save(data_test, file = "data_test.RData")
save(test_x, file = "test_x.RData")
########################################################################
# CREATE CROSS-VALIDATION FOLDS FOR MODEL EVALUATION/COMPARISON
########################################################################
#10-fold cross-validation, observations assigned to folds by random
#sampling stratified on the outcome; seed fixed for reproducibility.
set.seed(877)
folds <- vfold_cv(data_train, v = 10, strata = Gender)
#Alternative considered: leave-one-out CV
#folds <- loo_cv(data_train)
#Persist the folds so every model is compared on identical resamples
save(folds, file = "cv_folds.RData")
########################################################################
# EVALUATION METRICS
########################################################################
#Which metrics should be computed?
#Bundle of yardstick metrics evaluated for every model on the CV folds;
#roc_auc is probability-based, the rest are class-based.
my_metrics <- metric_set(roc_auc, recall, specificity, sensitivity, accuracy, bal_accuracy)
########################################################################
# CREATE RECIPE FOR PREPROCESSING DATA
########################################################################
#Preprocessing for the full-predictor models: normalize numeric
#predictors and dummy-code nominal predictors.
#(Note: no class-rebalancing/undersampling step is applied here.)
#The *_predictors() selectors replace all_numeric() and
#all_nominal() - all_outcomes(), excluding the outcome explicitly.
train_rec <-
  recipe(Gender ~ ., data = data_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
#Same preprocessing, restricted to the stepwise-selected predictors
stepwise_rec <-
  recipe(Gender ~ CM_IHD + Lung_Disease + Symptom_Fever +
           Symptom_Nausea_Vomiting, data = data_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())
########################################
# MODEL 1: Logistic regression
########################################
#Model specification: plain (unpenalized) logistic regression via glm
lr_mod <-
  logistic_reg() %>%
  set_engine("glm")
#Work flow: which model to use and how data should be preprocessed
lr_wflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(train_rec)
#Fit the model on the cross-validation resamples; per-fold predictions
#are saved so ROC curves can be built later
lr_fit_rs <-
  lr_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
## ! Fold01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## ! Fold04: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
#Get mean out-of-sample performance measures
lr_metrics <- collect_metrics(lr_fit_rs)
lr_metrics
#Keep metric name, CV mean and std error for later model comparison.
#Columns are selected by name rather than the fragile positional
#index c(1, 3, 5).
lr_metrics_sub <- lr_metrics %>% dplyr::select(.metric, mean, std_err)
lr_metrics_sub <- lr_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")
#Fit the above logistic regression model on the full training data
lr_fit_train <-
  lr_wflow %>%
  fit(data = data_train)
#Model summary of the underlying glm fit; extract_fit_engine()
#replaces the fragile $fit$fit$fit chain
summary(extract_fit_engine(lr_fit_train))
##
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0956 -0.9002 0.5103 0.8861 2.0058
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7389 1.0708 -0.690 0.4902
## Age -0.2554 0.3003 -0.850 0.3951
## BMI -0.3526 0.2890 -1.220 0.2226
## CM_DM_Yes 0.1324 0.5734 0.231 0.8174
## CM_HTM_Yes -0.2804 0.5949 -0.471 0.6375
## CM_IHD_Yes 1.1068 1.0408 1.063 0.2876
## Lung_Disease_Yes -1.8690 1.3828 -1.352 0.1765
## Symptom_Fever_Yes 0.9089 0.7202 1.262 0.2069
## Symptom_Cough_Yes -0.5054 0.6279 -0.805 0.4209
## Symptom_Sputum_Yes -0.2885 0.7262 -0.397 0.6911
## Symptom_Runny_Nose_Yes -0.9847 1.3959 -0.705 0.4805
## Symptom_Breathlessness_Yes 0.8674 0.5803 1.495 0.1349
## Symptom_Abdominal_Pain_Yes -0.7399 0.9642 -0.767 0.4428
## Symptom_Nausea_Vomiting_Yes -1.4820 0.7187 -2.062 0.0392 *
## Symptom_Diarrhoea_Yes 0.4630 0.7718 0.600 0.5486
## Symptom_Anosmia_Hyposmia_Yes -0.7717 1.3778 -0.560 0.5754
## Symptom_Red_Eye_Yes -17.2961 2399.5449 -0.007 0.9942
## Symptom_Skin_Rash_Yes -17.0013 2399.5449 -0.007 0.9943
## Outcome_Discharged 0.8281 0.7882 1.051 0.2934
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 128.05 on 92 degrees of freedom
## Residual deviance: 102.65 on 74 degrees of freedom
## AIC: 140.65
##
## Number of Fisher Scoring iterations: 15
#Get the predicted class probabilities computed for the full training data
lr_pred_prob_train <- predict(lr_fit_train, type = "prob", new_data = data_train)
#Get the receiver operator curve (ROC) computed for the full training
#data. The probability column is passed positionally: roc_curve() has
#no `.estimate` argument, so the previous `.estimate = .pred_Female`
#was only absorbed by the selector dots.
#(roc_curve() may emit a yardstick-internal reframe()/summarise()
#deprecation warning; it is harmless.)
lr_train_roc <- roc_curve(tibble(Gender = data_train$Gender, lr_pred_prob_train),
                          truth = Gender, .pred_Female) %>%
  mutate(model = "Log_reg")
#Get predicted class (outcome) and class probabilities for the test data
#When you have test data with outcome
lr_pred_class_test <- predict(lr_fit_train, type = "class", new_data = data_test)
lr_pred_prob_test <- predict(lr_fit_train, type = "prob", new_data = data_test)
#When you have test data without outcome
lr_pred_class_test_x <- predict(lr_fit_train, type = "class", new_data = test_no_outcome)
lr_pred_prob_test_x <- predict(lr_fit_train, type = "prob", new_data = test_no_outcome)
################################################
# MODEL 2: Stepwise logistic regression
################################################
#Model specification: same glm engine as Model 1; only the recipe
#(the stepwise-selected predictor set) differs
step_mod <-
  logistic_reg() %>%
  set_engine("glm")
#Work flow: which model to use and how data should be preprocessed
step_wflow <-
  workflow() %>%
  add_model(step_mod) %>%
  add_recipe(stepwise_rec)
#Fit the model on the cross-validation resamples, saving predictions
step_fit_rs <-
  step_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
#Get mean out-of-sample performance measures
step_metrics <- collect_metrics(step_fit_rs)
step_metrics
#Keep metric name, CV mean and std error, selected by name rather than
#the fragile positional index c(1, 3, 5)
step_metrics_sub <- step_metrics %>% dplyr::select(.metric, mean, std_err)
step_metrics_sub <- step_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")
#Fit the above logistic regression model on the full training data
step_fit_train <-
  step_wflow %>%
  fit(data = data_train)
#Model summary of the underlying glm fit; extract_fit_engine()
#replaces the fragile $fit$fit$fit chain
summary(extract_fit_engine(step_fit_train))
##
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3388 0.5767 -0.588 0.55686
## CM_IHD_Yes 0.9852 0.8424 1.170 0.24219
## Lung_Disease_Yes -1.7662 1.3218 -1.336 0.18148
## Symptom_Fever_Yes 1.0514 0.6095 1.725 0.08452 .
## Symptom_Nausea_Vomiting_Yes -1.5532 0.5547 -2.800 0.00511 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 128.05 on 92 degrees of freedom
## Residual deviance: 113.89 on 88 degrees of freedom
## AIC: 123.89
##
## Number of Fisher Scoring iterations: 4
#Get the predicted class probabilities computed for the full training data
step_pred_prob_train <- predict(step_fit_train, type = "prob", new_data = data_train)
#ROC on the training data; the probability column is passed
#positionally (roc_curve() has no `.estimate` argument)
stepwise_train_roc <- roc_curve(tibble(Gender = data_train$Gender, step_pred_prob_train),
                                truth = Gender, .pred_Female) %>%
  mutate(model = "Step_reg")
#Get predicted class (outcome) and class probabilities for the test data
#When you have test data with outcome
step_pred_class_test <- predict(step_fit_train, type = "class", new_data = data_test)
step_pred_prob_test <- predict(step_fit_train, type = "prob", new_data = data_test)
#When you have test data without outcome
step_pred_class_test_x <- predict(step_fit_train, type = "class", new_data = test_no_outcome)
step_pred_prob_test_x <- predict(step_fit_train, type = "prob", new_data = test_no_outcome)
################################################
# MODEL 3: Penalized logistic regression (Ridge)
################################################
#Model specification: ridge = glmnet with mixture = 0; the penalty
#(lambda) is tuned on the CV folds.
#(An unused lambda_grid object was removed; the tuning grid is built
#from the parameter set below.)
ridge_mod <-
  logistic_reg(mixture = 0, penalty = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
#Set up workflow
ridge_wflow <-
  workflow() %>%
  add_model(ridge_mod) %>%
  add_recipe(train_rec)
#Parameter object for the tunable parameters (possible values, ranges,
#types etc). extract_parameter_set_dials() replaces the deprecated
#parameters.workflow().
ridge_param <-
  ridge_wflow %>%
  extract_parameter_set_dials() %>%
  finalize(data_train)
#Look at the range for the penalty parameter;
#extract_parameter_dials() replaces the deprecated pull_dials_object()
ridge_param %>% extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: evaluate a grid of 200 penalty values and select the
#optimal one in terms of AUROC
set.seed(99)
ridge_tune <-
  ridge_wflow %>%
  tune_grid(
    folds,
    grid = ridge_param %>% grid_regular(levels = c(penalty = 200)),
    metrics = my_metrics
  )
#View plot of penalty values vs. performance
autoplot(ridge_tune) + theme(legend.position = "top")
#View the penalty values with largest AUROC; the metric is named
#explicitly to silence the "No value of `metric`" warning
show_best(ridge_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best penalty value (named metric argument)
ridge_best_param <- select_best(ridge_tune, metric = "roc_auc")
#Set up the final workflow using the best penalty value
final_ridge_wflow <-
  ridge_wflow %>%
  finalize_workflow(ridge_best_param)
#View the workflow specifications (selected: penalty = 1, mixture = 0)
final_ridge_wflow
#Fit the final model on the cross-validation folds set up for model
#evaluation/comparison
ridge_fit_rs <-
  final_ridge_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
#Get mean out-of-sample performance measures
ridge_metrics <- collect_metrics(ridge_fit_rs)
ridge_metrics
#Keep metric name, CV mean and std error, selected by name;
#values_to = ".estimate" for consistency with the other models
ridge_metrics_sub <- ridge_metrics %>% dplyr::select(.metric, mean, std_err)
ridge_metrics_sub <- ridge_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")
#Fit the final model on the full training data
ridge_fit_train <-
  final_ridge_wflow %>%
  fit(data = data_train)
#Look at variable importance; extract_fit_parsnip() replaces the
#deprecated pull_workflow_fit()
ridge_fit_train %>%
  extract_fit_parsnip() %>%
  vip(lambda = ridge_best_param$penalty, num_features = 200)
#Get the model coefficients
ridge_coeff <- data.frame(ridge_fit_train %>%
  extract_fit_parsnip() %>%
  tidy())
#Number of non-zero coefficients (ridge shrinks but never zeroes out)
sum(ridge_coeff$estimate != 0)
## [1] 19
#Number of zero coefficients
sum(ridge_coeff$estimate == 0)
## [1] 0
#Get the predicted class probabilities computed for the full training data
ridge_pred_prob_train <- predict(ridge_fit_train, type = "prob", new_data = data_train)
#ROC on the training data (probability column passed positionally)
ridge_train_roc <- roc_curve(tibble(Gender = data_train$Gender, ridge_pred_prob_train),
                             truth = Gender, .pred_Female) %>%
  mutate(model = "Ridge_reg")
#Get predicted class (outcome) and class probabilities for the test data
#When you have test data with outcome
ridge_pred_class_test <- predict(ridge_fit_train, type = "class", new_data = data_test)
ridge_pred_prob_test <- predict(ridge_fit_train, type = "prob", new_data = data_test)
#When you have test data without outcome
ridge_pred_class_test_no_outcome <- predict(ridge_fit_train, type = "class", new_data = test_no_outcome)
ridge_pred_prob_test_no_outcome <- predict(ridge_fit_train, type = "prob", new_data = test_no_outcome)
################################################
# MODEL 4: Penalized logistic regression (LASSO)
################################################
#Model specification: lasso = glmnet with mixture = 1; the penalty
#(lambda) is tuned on the CV folds.
#(An unused lambda_grid object was removed; the tuning grid is built
#from the parameter set below.)
lasso_mod <-
  logistic_reg(mixture = 1, penalty = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
#Set up workflow
lasso_wflow <-
  workflow() %>%
  add_model(lasso_mod) %>%
  add_recipe(train_rec)
#Parameter object for the tunable parameters;
#extract_parameter_set_dials() replaces the deprecated parameters()
lasso_param <-
  lasso_wflow %>%
  extract_parameter_set_dials() %>%
  finalize(data_train)
#Look at the range for the penalty parameter;
#extract_parameter_dials() replaces the deprecated pull_dials_object()
lasso_param %>% extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: evaluate a grid of 200 penalty values and select the
#optimal one in terms of AUROC
set.seed(99)
lasso_tune <-
  lasso_wflow %>%
  tune_grid(
    folds,
    grid = lasso_param %>% grid_regular(levels = c(penalty = 200)),
    metrics = my_metrics
  )
#View plot of penalty values vs. performance
autoplot(lasso_tune) + theme(legend.position = "top")
#View the penalty values with largest AUROC; the metric is named
#explicitly to silence the "No value of `metric`" warning
show_best(lasso_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best penalty value (named metric argument)
lasso_best_param <- select_best(lasso_tune, metric = "roc_auc")
#Set up the final workflow using the best penalty value
final_lasso_wflow <-
  lasso_wflow %>%
  finalize_workflow(lasso_best_param)
#View the workflow specifications
#(selected: penalty = 0.124588336429501, mixture = 1)
final_lasso_wflow
#Fit the final model on the cross-validation folds set up for model
#evaluation/comparison
lasso_fit_rs <-
  final_lasso_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
#Get mean out-of-sample performance measures
lasso_metrics <- collect_metrics(lasso_fit_rs)
lasso_metrics
#Keep metric name, CV mean and std error, selected by name;
#values_to = ".estimate" for consistency with the other models
lasso_metrics_sub <- lasso_metrics %>% dplyr::select(.metric, mean, std_err)
lasso_metrics_sub <- lasso_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")
#Fit the final model on the full training data
lasso_fit_train <-
  final_lasso_wflow %>%
  fit(data = data_train)
#Look at variable importance; extract_fit_parsnip() replaces the
#deprecated pull_workflow_fit()
lasso_fit_train %>%
  extract_fit_parsnip() %>%
  vip(lambda = lasso_best_param$penalty, num_features = 200)
#Get the model coefficients
lasso_coeff <- data.frame(lasso_fit_train %>%
  extract_fit_parsnip() %>%
  tidy())
#Number of non-zero coefficients (lasso zeroes out most predictors)
sum(lasso_coeff$estimate != 0)
## [1] 2
#Number of zero coefficients
sum(lasso_coeff$estimate == 0)
## [1] 17
#Get the predicted class probabilities computed for the full training data
lasso_pred_prob_train <- predict(lasso_fit_train, type = "prob", new_data = data_train)
#ROC on the training data (probability column passed positionally)
lasso_train_roc <- roc_curve(tibble(Gender = data_train$Gender, lasso_pred_prob_train),
                             truth = Gender, .pred_Female) %>%
  mutate(model = "Lasso_reg")
#Get predicted class (outcome) and class probabilities for the test data
#When you have test data with outcome
lasso_pred_class_test <- predict(lasso_fit_train, type = "class", new_data = data_test)
lasso_pred_prob_test <- predict(lasso_fit_train, type = "prob", new_data = data_test)
#When you have test data without outcome
lasso_pred_class_test_no_outcome <- predict(lasso_fit_train, type = "class", new_data = test_no_outcome)
lasso_pred_prob_test_no_outcome <- predict(lasso_fit_train, type = "prob", new_data = test_no_outcome)
################################################
# MODEL 5: Logistic elastic net regression (ELNET)
################################################
#Model specification: elastic net = glmnet; both the penalty (lambda)
#and the mixture (alpha) are tuned on the CV folds.
#(An unused lambda_grid object was removed; the tuning grid is built
#from the parameter set below.)
elnet_mod <-
  logistic_reg(mixture = tune(), penalty = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
#Set up workflow
elnet_wflow <-
  workflow() %>%
  add_model(elnet_mod) %>%
  add_recipe(train_rec)
#Parameter object for the tunable parameters;
#extract_parameter_set_dials() replaces the deprecated parameters()
elnet_param <-
  elnet_wflow %>%
  extract_parameter_set_dials() %>%
  finalize(data_train)
#Look at the range for the penalty parameter;
#extract_parameter_dials() replaces the deprecated pull_dials_object()
elnet_param %>% extract_parameter_dials("penalty")
## Amount of Regularization (quantitative)
## Transformer: log-10 [1e-100, Inf]
## Range (transformed scale): [-10, 0]
#Tune the model: 100 penalty values x 10 mixture values, select the
#optimal combination in terms of AUROC
set.seed(99)
elnet_tune <-
  elnet_wflow %>%
  tune_grid(
    folds,
    grid = elnet_param %>% grid_regular(levels = c(penalty = 100, mixture = 10)),
    metrics = my_metrics
  )
#View plot of penalty values vs. performance
autoplot(elnet_tune) + theme(legend.position = "top")
#View the parameter combinations with largest AUROC; the metric is
#named explicitly to silence the "No value of `metric`" warning
show_best(elnet_tune, metric = "roc_auc") %>% dplyr::select(-.estimator)
#Store the best parameter combination (named metric argument)
elnet_best_param <- select_best(elnet_tune, metric = "roc_auc")
#Set up the final workflow using the best parameter combination
final_elnet_wflow <-
  elnet_wflow %>%
  finalize_workflow(elnet_best_param)
#View the workflow specifications
#(selected: penalty = 0.792482898353919, mixture = 0.05)
final_elnet_wflow
#Fit the final model on the cross-validation folds set up for model
#evaluation/comparison
elnet_fit_rs <-
  final_elnet_wflow %>%
  fit_resamples(folds, metrics = my_metrics, control = control_resamples(save_pred = TRUE))
#Get mean out-of-sample performance measures
elnet_metrics <- collect_metrics(elnet_fit_rs)
elnet_metrics
#Keep metric name, CV mean and std error, selected by name;
#values_to = ".estimate" for consistency with the other models
elnet_metrics_sub <- elnet_metrics %>% dplyr::select(.metric, mean, std_err)
elnet_metrics_sub <- elnet_metrics_sub %>%
  pivot_longer(!.metric, names_to = "measure", values_to = ".estimate")
#Fit the final model on the full training data
elnet_fit_train <-
  final_elnet_wflow %>%
  fit(data = data_train)
#Look at variable importance; extract_fit_parsnip() replaces the
#deprecated pull_workflow_fit()
elnet_fit_train %>%
  extract_fit_parsnip() %>%
  vip(lambda = elnet_best_param$penalty, num_features = 200)
#Get the model coefficients
elnet_coeff <- data.frame(elnet_fit_train %>%
  extract_fit_parsnip() %>%
  tidy())
#Number of non-zero coefficients
sum(elnet_coeff$estimate != 0)
## [1] 12
#Number of zero coefficients
sum(elnet_coeff$estimate == 0)
## [1] 7
#Get the predicted class probabilities computed for the full training data
elnet_pred_prob_train <- predict(elnet_fit_train, type = "prob", new_data = data_train)
#ROC on the training data (probability column passed positionally)
elnet_train_roc <- roc_curve(tibble(Gender = data_train$Gender, elnet_pred_prob_train),
                             truth = Gender, .pred_Female) %>%
  mutate(model = "Elnet")
#Get predicted class (outcome) and class probabilities for the test data
#When you have test data with outcome
elnet_pred_class_test <- predict(elnet_fit_train, type = "class", new_data = data_test)
elnet_pred_prob_test <- predict(elnet_fit_train, type = "prob", new_data = data_test)
#When you have test data without outcome
elnet_pred_class_test_no_outcome <- predict(elnet_fit_train, type = "class", new_data = test_no_outcome)
elnet_pred_prob_test_no_outcome <- predict(elnet_fit_train, type = "prob", new_data = test_no_outcome)
library(inspectdf)
library(skimr)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(MuMIn)
library(plotmo)
## Loading required package: Formula
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
##
## rescale
## Loading required package: TeachingDemos
library(separationplot)
## Loading required package: RColorBrewer
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:TeachingDemos':
##
## cnvrt.coords, subplot
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following object is masked from 'package:parsnip':
##
## translate
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: foreign
library(ggfortify)
## Registered S3 method overwritten by 'ggfortify':
## method from
## autoplot.glmnet parsnip
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(cvms)
library(broom)
library(yardstick)
library(glmnet)
library(glmtoolbox)
##
## Attaching package: 'glmtoolbox'
## The following object is masked from 'package:MuMIn':
##
## QIC
######################################################
# SUMMARIZE RESULTS BASED ON TRAINING DATA EVALUATIONS
######################################################
#Combine the results from different models in one tibble.
#Each *_metrics_sub tibble has 12 rows (6 metrics x mean/std_err);
#the original index c(11:12, 1:10, 13:14) reached past row 12 and
#produced all-NA rows in the rendered table. Rows 11:12 (specificity)
#are put first, then the remaining metrics.
row_order <- c(11:12, 1:10)
#Take the metric/measure label columns once, then only the estimate
#column (column 3) from each model; binding the label columns for every
#model duplicated them and mangled the table.
metrics_table_data_train <- bind_cols(
  lr_metrics_sub[row_order, 1:2],
  lr_metrics_sub[row_order, 3],
  step_metrics_sub[row_order, 3],
  ridge_metrics_sub[row_order, 3],
  lasso_metrics_sub[row_order, 3],
  elnet_metrics_sub[row_order, 3]
)
colnames(metrics_table_data_train) <- c("Metric", "Measure", "Log_reg", "Step_reg", "Ridge_reg", "Lasso_reg", "Elnet")
#Convert the tibble to a data.frame for kableExtra
results_table_train <- data.frame(metrics_table_data_train)
#Produce a table with results based on training data
results_table_train %>%
  kbl(caption = "Table 3. Model performance based on 10-fold CV on training data (n = 94)", digits = 3) %>%
  kable_classic(full_width = F, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
| Metric | Measure | Log_reg | Step_reg | Ridge_reg | Lasso_reg | Elnet | NA. | NA..1 |
|---|---|---|---|---|---|---|---|---|
| specificity | mean | 0.720 | specificity | mean | 0.840 | 0.840 | 0.860 | 0.860 |
| specificity | std_err | 0.053 | specificity | std_err | 0.058 | 0.040 | 0.060 | 0.043 |
| accuracy | mean | 0.553 | accuracy | mean | 0.714 | 0.544 | 0.516 | 0.545 |
| accuracy | std_err | 0.059 | accuracy | std_err | 0.049 | 0.040 | 0.023 | 0.041 |
| bal_accuracy | mean | 0.532 | bal_accuracy | mean | 0.700 | 0.510 | 0.480 | 0.510 |
| bal_accuracy | std_err | 0.062 | bal_accuracy | std_err | 0.049 | 0.041 | 0.022 | 0.042 |
| recall | mean | 0.345 | recall | mean | 0.560 | 0.180 | 0.100 | 0.160 |
| recall | std_err | 0.094 | recall | std_err | 0.055 | 0.052 | 0.055 | 0.046 |
| roc_auc | mean | 0.509 | roc_auc | mean | 0.658 | 0.552 | 0.598 | 0.601 |
| roc_auc | std_err | 0.076 | roc_auc | std_err | 0.052 | 0.072 | 0.049 | 0.062 |
| sensitivity | mean | 0.345 | sensitivity | mean | 0.560 | 0.180 | 0.100 | 0.160 |
| sensitivity | std_err | 0.094 | sensitivity | std_err | 0.055 | 0.052 | 0.055 | 0.046 |
| NA | NA | NA | NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA | NA | NA | NA |
#Overlay the training-data ROC curves of all five final models
ggplot(
  bind_rows(lr_train_roc, stepwise_train_roc, ridge_train_roc,
            lasso_train_roc, elnet_train_roc),
  aes(x = 1 - specificity, y = sensitivity, col = model)
) +
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +  # diagonal reference = chance-level classifier
  coord_equal() +
  scale_color_viridis_d(option = "plasma", end = .6) +
  ggtitle("ROC Based on Training Data")
######################################################
# SUMMARIZE RESULTS BASED ON TEST DATA EVALUATION
######################################################
#Prepare results for producing a table
lr_results_test <- tibble(Gender = data_test$Gender, lr_pred_prob_test, lr_pred_class_test)
step_results_test <- tibble(Gender = data_test$Gender, step_pred_prob_test, step_pred_class_test)
ridge_results_test <- tibble(Gender = data_test$Gender, ridge_pred_prob_test, ridge_pred_class_test)
lasso_results_test <- tibble(Gender = data_test$Gender, lasso_pred_prob_test, lasso_pred_class_test)
elnet_results_test <- tibble(Gender = data_test$Gender, elnet_pred_prob_test, elnet_pred_class_test)
lr_metrics_sub_test <- bind_rows(roc_auc(lr_results_test, Gender, .pred_Female)[, c(1,3)],
accuracy(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
bal_accuracy(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
f_meas(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
precision(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
recall(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
specificity(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
sensitivity(lr_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)])
step_metrics_sub_test <- bind_rows(roc_auc(step_results_test, Gender, .pred_Female)[, c(1,3)],
accuracy(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
bal_accuracy(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
f_meas(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
precision(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
recall(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
specificity(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
sensitivity(step_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)])
ridge_metrics_sub_test <- bind_rows(roc_auc(ridge_results_test, Gender, .pred_Female)[, c(1,3)],
accuracy(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
bal_accuracy(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
f_meas(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
precision(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
recall(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
specificity(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
sensitivity(ridge_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)])
lasso_metrics_sub_test <- bind_rows(roc_auc(lasso_results_test, Gender, .pred_Female)[, c(1,3)],
accuracy(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
bal_accuracy(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
f_meas(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
precision(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
recall(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
specificity(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
sensitivity(lasso_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)])
elnet_metrics_sub_test <- bind_rows(roc_auc(elnet_results_test, Gender, .pred_Female)[, c(1,3)],
accuracy(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
bal_accuracy(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
f_meas(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
precision(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
recall(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
specificity(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)],
sensitivity(elnet_results_test, truth = Gender, estimate = .pred_class)[, c(1,3)])
#Combine the results from different models in one tibble.
#.name_repair = "minimal" keeps the duplicated `.estimate` names as-is (all
#names are overwritten just below), avoiding dplyr's ".estimate...2" renaming
#messages that previously appeared here.
metrics_table_data_test <- bind_cols(
  lr_metrics_sub_test[, 1:2],
  step_metrics_sub_test[, 2],
  ridge_metrics_sub_test[, 2],
  lasso_metrics_sub_test[, 2],
  elnet_metrics_sub_test[, 2],
  .name_repair = "minimal"
)
colnames(metrics_table_data_test) <- c("Metric", "Logistic", "Stepwise", "Ridge", "Lasso", "Elnet")
#Convert the tibble to a data.frame
results_table_test <- data.frame(metrics_table_data_test)
#Produce a table with results based on test data
#NOTE(review): the later confusion matrix shows 41 test rows; confirm the
#"n = 40" in this caption.
results_table_test %>%
  kbl(caption = "Table 4. Model performance evaluated on test data (n = 40)", digits = 3) %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
| Metric | Logistic | Stepwise | Ridge | Lasso | Elnet |
|---|---|---|---|---|---|
| roc_auc | 0.751 | 0.711 | 0.761 | 0.518 | 0.688 |
| accuracy | 0.634 | 0.683 | 0.610 | 0.561 | 0.561 |
| bal_accuracy | 0.620 | 0.663 | 0.580 | 0.518 | 0.518 |
| f_meas | 0.545 | 0.581 | 0.429 | 0.250 | 0.250 |
| precision | 0.600 | 0.692 | 0.600 | 0.500 | 0.500 |
| recall | 0.500 | 0.500 | 0.333 | 0.167 | 0.167 |
| specificity | 0.739 | 0.826 | 0.826 | 0.870 | 0.870 |
| sensitivity | 0.500 | 0.500 | 0.333 | 0.167 | 0.167 |
#Plot ROC:s using final models fit on full training data to predict on test data
lr_test_roc <- roc_curve(tibble(Gender = data_test$Gender, lr_pred_prob_test), truth = Gender, estimate = .pred_Female) %>%
  mutate(model = "Log_reg")
stepwise_test_roc <- roc_curve(tibble(Gender = data_test$Gender, step_pred_prob_test), truth = Gender, estimate = .pred_Female) %>%
  mutate(model = "Stepwise_reg")
ridge_test_roc <- roc_curve(tibble(Gender = data_test$Gender, ridge_pred_prob_test), truth = Gender, estimate = .pred_Female) %>%
  mutate(model = "Ridge_reg")
lasso_test_roc <- roc_curve(tibble(Gender = data_test$Gender, lasso_pred_prob_test), truth = Gender, estimate = .pred_Female) %>%
  mutate(model = "Lasso_reg")
elnet_test_roc <- roc_curve(tibble(Gender = data_test$Gender, elnet_pred_prob_test), truth = Gender, estimate = .pred_Female) %>%
  mutate(model = "Elnet_reg")
#Overlay all five ROC curves on one panel.
#position_dodge() was removed: it requires discrete, non-overlapping x
#intervals (hence the warning it produced) and has no meaning for ROC paths.
bind_rows(lr_test_roc, stepwise_test_roc, ridge_test_roc, lasso_test_roc, elnet_test_roc) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) +
  coord_equal() +
  scale_color_viridis_d(option = "turbo", end = .6) +
  ggtitle("ROC Based on Test Data")
## Warning: `position_dodge()` requires non-overlapping x intervals
Yes, diagonal segments in a ROC curve are legitimate. If moving from one threshold to the next increases the number of false positives and true positives at the same time, the result is a diagonal line. Two reasons this can happen:
(1) two observations share the same predicted score but have different ground-truth labels; (2) the gap between two consecutive thresholds is large enough — in that case you can also evaluate an intermediate threshold between the two.
#Inspect cross-validated metrics over the lasso penalty grid.
lasso_tune %>%
  collect_metrics()
######################################################
# LASSO PLOTS
######################################################
#Mean CV metric value as a function of penalty (log-scaled x axis).
lasso_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  #geom_errorbar(aes(
  # ymin = mean - std_err,
  # ymax = mean + std_err
  #),
  #alpha = 0.5
  #) +
  geom_line(linewidth = 1.5) + # `size` for lines was deprecated in ggplot2 3.4.0
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Select the lasso penalty with the best cross-validated ROC AUC.
#The deprecated `maximize` argument (which tune ignored anyway) is dropped,
#and the metric is passed by name as current tune::select_best() expects.
best_auc <- lasso_tune %>%
  select_best(metric = "roc_auc")
## Warning: The `maximize` argument is no longer needed. This value was ignored.
#Variable importance for the lasso model at the selected penalty.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
lasso_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty) %>%
  mutate(
    Importance = abs(Importance), # magnitude only; direction kept via Sign fill
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Lasso Model - Explanatory Variable Importance")
######################################################
# RIDGE PLOTS
######################################################
#Mean CV metric value as a function of penalty (log-scaled x axis).
ridge_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  #geom_errorbar(aes(
  # ymin = mean - std_err,
  # ymax = mean + std_err
  #),
  #alpha = 0.5
  #) +
  geom_line(linewidth = 1.5) + # `size` for lines was deprecated in ggplot2 3.4.0
  facet_wrap(~.metric, scales = "free", nrow = 2) +
  scale_x_log10() +
  theme(legend.position = "none")
#Select the ridge penalty with the best cross-validated ROC AUC.
#Deprecated `maximize` dropped; metric passed by name.
best_auc <- ridge_tune %>%
  select_best(metric = "roc_auc")
## Warning: The `maximize` argument is no longer needed. This value was ignored.
#Variable importance for the ridge model at the selected penalty.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
ridge_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Ridge Regression Model - Explanatory Variable Importance")
######################################################
# ELNET PLOTS
######################################################
#Mean CV metric value over the penalty/mixture grid (log-scaled x axis).
elnet_tune %>%
  collect_metrics() %>%
  ggplot(aes(penalty, mean, color = .metric)) +
  #geom_errorbar(aes(
  # ymin = mean - std_err,
  # ymax = mean + std_err
  #),
  #alpha = 0.5
  #) +
  geom_line(linewidth = .5) + # `size` for lines was deprecated in ggplot2 3.4.0
  facet_wrap(~.metric, scales = "free", nrow = 6) +
  scale_x_log10() +
  theme(legend.position = "none")
#Select the elastic-net penalty with the best cross-validated ROC AUC.
#Deprecated `maximize` dropped; metric passed by name.
best_auc <- elnet_tune %>%
  select_best(metric = "roc_auc")
## Warning: The `maximize` argument is no longer needed. This value was ignored.
#Variable importance for the elastic-net model at the selected penalty.
#extract_fit_parsnip() replaces the deprecated pull_workflow_fit().
#NOTE(review): `num_features` is a vip::vip() plotting argument; vi() likely
#ignores it — confirm whether a top-20 cut was intended here.
elnet_fit_train %>%
  fit(data_train) %>%
  extract_fit_parsnip() %>%
  vi(lambda = best_auc$penalty, num_features = 20) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Elastic Net Model - Explanatory Variable Importance")
######################################################
# STEPWISE PLOTS
######################################################
#Variable importance from the stepwise logistic regression glm fit.
vi(step_fit_train$fit$fit$fit) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  # Typo fixed in the title: "Logisitc" -> "Logistic"
  ggtitle("Stepwise Logistic Regression - Explanatory Variable Importance")
######################################################
# LR PLOTS
######################################################
# Variable importance from the full logistic regression glm fit.
lr_importance <- vi(lr_fit_train$fit$fit$fit)
lr_importance %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL) +
  ggtitle("Logistic Regression - Explanatory Variable Importance")
#Print the tuned parsnip specification stored in each fitted workflow.
#For glmnet models, penalty = lambda and mixture = alpha.
lasso_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = 0.124588336429501
## mixture = 1
##
## Computational engine: glmnet
##
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(),
## alpha = 1, family = "binomial")
ridge_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = 1
## mixture = 0
##
## Computational engine: glmnet
##
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(),
## alpha = 0, family = "binomial")
elnet_fit_train$fit$fit$spec
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = 0.792482898353919
## mixture = 0.05
##
## Computational engine: glmnet
##
## Model fit template:
## glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(),
## alpha = 0.05, family = "binomial")
######################################################
# GOODNESS OF FIT
######################################################
#Dummy-coded design matrices (intercept column dropped) and outcome vectors.
train_x <- model.matrix(Gender ~ ., data_train)[, -1]
train_y <- data_train$Gender
test_x <- model.matrix(Gender ~ ., data_test)[, -1]
test_y <- data_test$Gender
#NOTE(review): these glmnet models are refit on the TEST data, so the
#deviances below measure fit on the test set, not the training fit — confirm
#this is intentional.
lasso_mod <- glmnet(test_x, test_y, family = "binomial", alpha = 1, lambda = 0.024658110758226)
ridge_mod <- glmnet(test_x, test_y, family = "binomial", alpha = 0, lambda = 0.0195639834351706)
#Fixed: the elastic-net fit previously used alpha = 1, which is a lasso; the
#tuned elnet specification has mixture = 0.05, so alpha = 0.05 is used here.
#TODO(review): the lambda below also differs from the tuned penalty (0.7925) — confirm.
elnet_mod <- glmnet(test_x, test_y, family = "binomial", alpha = 0.05, lambda = 0.0242012826479438)
#Residual deviances of the two glm-based models (from the training fits).
lr_deviance <- lr_fit_train$fit$fit$fit$deviance
#lr_fit_train$fit$fit$fit$aic%>% tidy()
step_deviance <- step_fit_train$fit$fit$fit$deviance
#step_fit_train$fit$fit$fit$aic%>% tidy()
lasso_deviance <- deviance(lasso_mod)
ridge_deviance <- deviance(ridge_mod)
elnet_deviance <- deviance(elnet_mod)
deviances <- cbind("Deviance", lr_deviance, step_deviance, ridge_deviance, lasso_deviance, elnet_deviance)
#p-value = 1 - pchisq(deviance, degrees of freedom)
summary(step_fit_train$fit$fit$fit)
##
## Call:
## stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9318 -1.0373 0.8933 0.8933 2.0162
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3388 0.5767 -0.588 0.55686
## CM_IHD_Yes 0.9852 0.8424 1.170 0.24219
## Lung_Disease_Yes -1.7662 1.3218 -1.336 0.18148
## Symptom_Fever_Yes 1.0514 0.6095 1.725 0.08452 .
## Symptom_Nausea_Vomiting_Yes -1.5532 0.5547 -2.800 0.00511 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 128.05 on 92 degrees of freedom
## Residual deviance: 113.89 on 88 degrees of freedom
## AIC: 123.89
##
## Number of Fisher Scoring iterations: 4
#Degrees of freedom for each model. For the glm fits this is the residual df;
#for the glmnet fits `$df` is the number of NONZERO coefficients at the chosen
#lambda — a different quantity.
lr_fit_train$fit$fit$fit$df.residual
## [1] 74
step_fit_train$fit$fit$fit$df.residual
## [1] 88
lasso_mod$df
## [1] 12
ridge_mod$df
## [1] 18
elnet_mod$df
## [1] 12
#Chi-square goodness-of-fit p-values: 1 - pchisq(deviance, df).
#NOTE(review): for the glmnet models the nonzero-coefficient count is used as
#df, which makes these p-values heuristic at best — confirm intended.
p_lr_dev <- 1 - pchisq(lr_deviance, lr_fit_train$fit$fit$fit$df.residual)
p_lr_dev
## [1] 0.01543401
p_step_dev <- 1 - pchisq(step_deviance, step_fit_train$fit$fit$fit$df.residual)
p_step_dev
## [1] 0.03313262
p_ridge_dev <- 1- pchisq(ridge_deviance, ridge_mod$df)
p_ridge_dev
## [1] 0.2729321
p_lasso_dev <- 1- pchisq(lasso_deviance, lasso_mod$df)
p_lasso_dev
## [1] 0.009091613
p_elnet_dev <- 1- pchisq(elnet_deviance, elnet_mod$df)
p_elnet_dev
## [1] 0.009673136
#Print the lasso coefficient table (object built earlier in the script).
lasso_coeff
#Information-criterion rows for the glmnet models. Here tLL is set to
#-deviance, so AIC_ = deviance + 2k and BIC = deviance + log(n)*k; these are
#offset from glm's AIC by a constant, so they are comparable among the glmnet
#fits but not directly against the glm AICs below — noted for review.
#The vectors h (header), l, r, e are reused when the table is assembled later.
#LASSO
tLL <- -deviance(lasso_mod)
k <- lasso_mod$df
n <- lasso_mod$nobs
AICc <- -tLL+2*k+2*k*(k+1)/(n-k-1)
AIC_ <- -tLL+2*k
BIC<-log(n)*k - tLL
h <- c("Model", "DF", "AIC", "BIC", "AICc", "Deviance")
l <- c("Lasso",k, AIC_, BIC, AICc, lasso_deviance)
#RIDGE
tLL <- -deviance(ridge_mod)
k <- ridge_mod$df
n <- ridge_mod$nobs
AICc <- -tLL+2*k+2*k*(k+1)/(n-k-1)
AIC_ <- -tLL+2*k
BIC<-log(n)*k - tLL
r <- c("Ridge",k, AIC_, BIC, AICc, ridge_deviance)
#ELNET
tLL <- -deviance(elnet_mod)
k <- elnet_mod$df
n <- elnet_mod$nobs
AICc <- -tLL+2*k+2*k*(k+1)/(n-k-1)
AIC_ <- -tLL+2*k
BIC<-log(n)*k - tLL
e <- c("Elnet",k, AIC_, BIC, AICc, elnet_deviance)
#Quick preview of the glmnet rows (header + ridge/lasso/elnet).
rbind(h,r,l, e)
## [,1] [,2] [,3] [,4] [,5]
## h "Model" "DF" "AIC" "BIC" "AICc"
## r "Ridge" "18" "57.1294591736847" "87.9737563743623" "88.2203682645938"
## l "Lasso" "12" "50.5074134032682" "71.0702782037199" "61.6502705461253"
## e "Elnet" "12" "50.3184903173487" "70.8813551178004" "61.4613474602058"
## [,6]
## h "Deviance"
## r "21.1294591736847"
## l "26.5074134032682"
## e "26.3184903173487"
#LR: AIC straight from the glm fit; AICc/BIC via their generics.
AIC_ <- lr_fit_train$fit$fit$fit$aic
k <- lr_fit_train$fit$fit$fit$df.residual
AICc <- AICc(lr_fit_train$fit$fit$fit)
BIC <- BIC(lr_fit_train$fit$fit$fit)
lr <- c("Logistic", k, AIC_, BIC, AICc, lr_deviance)
#STEP: same quantities for the stepwise glm fit.
AIC_ <- step_fit_train$fit$fit$fit$aic
k <- step_fit_train$fit$fit$fit$df.residual
AICc <- AICc(step_fit_train$fit$fit$fit)
BIC <- BIC(step_fit_train$fit$fit$fit)
s <- c("Stepwise", k, AIC_, BIC, AICc, step_deviance)
#Assemble the comparison table; the first row (h) carries the column names.
models <- data.frame(rbind(h, lr, s, r, l, e))
names(models) <- models[1, ]
models <- models[-1, ]
rownames(models) <- NULL
#rbind() stored everything as character; convert the IC columns back to
#numeric and round. across() replaces the superseded mutate_at()/mutate_if().
models <- models %>%
  mutate(across(c(AIC, AICc, BIC, Deviance), as.numeric)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 2)))
#Render the table (stray ")" removed from the caption placeholder).
models %>%
  kbl(caption = "Table x.") %>%
  kable_classic(full_width = FALSE, html_font = "Cambria") %>%
  collapse_rows(columns = 1, valign = "top")
| Model | DF | AIC | BIC | AICc | Deviance |
|---|---|---|---|---|---|
| Logistic | 74 | 140.65 | 188.77 | 151.06 | 102.65 |
| Stepwise | 88 | 123.89 | 136.55 | 124.58 | 113.89 |
| Ridge | 18 | 57.13 | 87.97 | 88.22 | 21.13 |
| Lasso | 12 | 50.51 | 71.07 | 61.65 | 26.51 |
| Elnet | 12 | 50.32 | 70.88 | 61.46 | 26.32 |
#Persist the comparison table for use in the thesis document.
save(models, file = "model_compare.RData")
#Distribution of the elastic net's predicted class probabilities (train set).
hist(elnet_pred_prob_train$.pred_Female)
hist(elnet_pred_prob_train$.pred_Male)
#Fraction of null deviance explained by each glmnet fit.
summary(lasso_mod$dev.ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5286 0.5286 0.5286 0.5286 0.5286 0.5286
ridge_mod$dev.ratio
## [1] 0.6242101
elnet_mod$dev.ratio
## [1] 0.5319226
#Residual plots for the two glm-based models.
plot(residuals(step_fit_train$fit$fit$fit))
plot(residuals(lr_fit_train$fit$fit$fit))
#plot(residuals(lasso_fit_train$fit$fit$fit$nobs))
plot(lasso_fit_train$fit$fit$fit)
#Coefficient-path plot for the elastic net.
#NOTE(review): xvar is given the full vector of options; plot_glmnet will use
#only the first match ("rlambda") — confirm that is the intended axis.
plot_glmnet(elnet_fit_train$fit$fit$fit ,
xvar = c("rlambda", "lambda", "norm", "dev"),
label = 10, nresponse = NA, grid.col = NA, s = NA)
#Lasso refit on the TRAINING design matrix for plotres() diagnostics.
moo <-glmnet(train_x, train_y, family = "binomial", alpha = 1, lambda =0.024658110758226)
plotres(moo,w1.col=1:9, w1.nresponse="Male", w1.xvar = c( "dev"))
#Refit the elastic net on the test design matrix.
#Fixed: alpha was 1 (a lasso, not an elastic net); the tuned elnet spec has
#mixture = 0.05. `type.measure` was removed — it is a cv.glmnet() argument
#and has no effect in glmnet().
elnet_mod <- glmnet::glmnet(test_x, test_y, family = "binomial", alpha = 0.05, lambda = 0.0242012826479438)
#Null deviance of the refit model.
elnet_mod$nulldev
## [1] 56.22679
#sure: surrogate residuals for categorical-response models.
library (sure)
#NOTE(review): resid() on the prediction tibble finds no `residuals`
#component and returns NULL (see the warnings below) — this call likely does
#nothing useful and should be revisited.
resid(elnet_pred_class_test)
## Warning: Unknown or uninitialised column: `na.action`.
## Warning: Unknown or uninitialised column: `residuals`.
## NULL
#Out-of-fold CV predictions for the elastic net (one tibble per fold).
elnet_fit_rs$.predictions
## [[1]]
## # A tibble: 11 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.446 0.554 18 Male Female Preprocessor1_Model1
## 2 0.410 0.590 21 Male Female Preprocessor1_Model1
## 3 0.421 0.579 23 Male Female Preprocessor1_Model1
## 4 0.510 0.490 35 Female Female Preprocessor1_Model1
## 5 0.501 0.499 42 Female Female Preprocessor1_Model1
## 6 0.421 0.579 52 Male Male Preprocessor1_Model1
## 7 0.414 0.586 57 Male Male Preprocessor1_Model1
## 8 0.427 0.573 62 Male Male Preprocessor1_Model1
## 9 0.427 0.573 78 Male Male Preprocessor1_Model1
## 10 0.412 0.588 85 Male Male Preprocessor1_Model1
## 11 0.426 0.574 92 Male Male Preprocessor1_Model1
##
## [[2]]
## # A tibble: 10 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.416 0.584 6 Male Female Preprocessor1_Model1
## 2 0.493 0.507 17 Male Female Preprocessor1_Model1
## 3 0.446 0.554 20 Male Female Preprocessor1_Model1
## 4 0.535 0.465 31 Female Female Preprocessor1_Model1
## 5 0.464 0.536 41 Male Female Preprocessor1_Model1
## 6 0.527 0.473 71 Female Male Preprocessor1_Model1
## 7 0.415 0.585 72 Male Male Preprocessor1_Model1
## 8 0.414 0.586 77 Male Male Preprocessor1_Model1
## 9 0.416 0.584 83 Male Male Preprocessor1_Model1
## 10 0.415 0.585 91 Male Male Preprocessor1_Model1
##
## [[3]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.430 0.570 4 Male Female Preprocessor1_Model1
## 2 0.434 0.566 12 Male Female Preprocessor1_Model1
## 3 0.393 0.607 32 Male Female Preprocessor1_Model1
## 4 0.399 0.601 38 Male Female Preprocessor1_Model1
## 5 0.404 0.596 45 Male Male Preprocessor1_Model1
## 6 0.427 0.573 47 Male Male Preprocessor1_Model1
## 7 0.427 0.573 50 Male Male Preprocessor1_Model1
## 8 0.393 0.607 69 Male Male Preprocessor1_Model1
## 9 0.525 0.475 86 Female Male Preprocessor1_Model1
##
## [[4]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.419 0.581 15 Male Female Preprocessor1_Model1
## 2 0.432 0.568 19 Male Female Preprocessor1_Model1
## 3 0.442 0.558 33 Male Female Preprocessor1_Model1
## 4 0.485 0.515 36 Male Female Preprocessor1_Model1
## 5 0.430 0.570 54 Male Male Preprocessor1_Model1
## 6 0.519 0.481 73 Female Male Preprocessor1_Model1
## 7 0.411 0.589 74 Male Male Preprocessor1_Model1
## 8 0.409 0.591 76 Male Male Preprocessor1_Model1
## 9 0.488 0.512 79 Male Male Preprocessor1_Model1
##
## [[5]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.438 0.562 26 Male Female Preprocessor1_Model1
## 2 0.490 0.510 27 Male Female Preprocessor1_Model1
## 3 0.455 0.545 29 Male Female Preprocessor1_Model1
## 4 0.418 0.582 39 Male Female Preprocessor1_Model1
## 5 0.460 0.540 43 Male Male Preprocessor1_Model1
## 6 0.436 0.564 51 Male Male Preprocessor1_Model1
## 7 0.423 0.577 56 Male Male Preprocessor1_Model1
## 8 0.431 0.569 68 Male Male Preprocessor1_Model1
## 9 0.535 0.465 88 Female Male Preprocessor1_Model1
##
## [[6]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.429 0.571 8 Male Female Preprocessor1_Model1
## 2 0.551 0.449 10 Female Female Preprocessor1_Model1
## 3 0.450 0.550 28 Male Female Preprocessor1_Model1
## 4 0.432 0.568 30 Male Female Preprocessor1_Model1
## 5 0.424 0.576 44 Male Male Preprocessor1_Model1
## 6 0.432 0.568 48 Male Male Preprocessor1_Model1
## 7 0.422 0.578 53 Male Male Preprocessor1_Model1
## 8 0.422 0.578 59 Male Male Preprocessor1_Model1
## 9 0.433 0.567 80 Male Male Preprocessor1_Model1
##
## [[7]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.531 0.469 1 Female Female Preprocessor1_Model1
## 2 0.389 0.611 2 Male Female Preprocessor1_Model1
## 3 0.436 0.564 11 Male Female Preprocessor1_Model1
## 4 0.446 0.554 24 Male Female Preprocessor1_Model1
## 5 0.444 0.556 49 Male Male Preprocessor1_Model1
## 6 0.445 0.555 65 Male Male Preprocessor1_Model1
## 7 0.431 0.569 66 Male Male Preprocessor1_Model1
## 8 0.416 0.584 89 Male Male Preprocessor1_Model1
## 9 0.417 0.583 93 Male Male Preprocessor1_Model1
##
## [[8]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.416 0.584 3 Male Female Preprocessor1_Model1
## 2 0.483 0.517 5 Male Female Preprocessor1_Model1
## 3 0.526 0.474 14 Female Female Preprocessor1_Model1
## 4 0.403 0.597 40 Male Female Preprocessor1_Model1
## 5 0.483 0.517 55 Male Male Preprocessor1_Model1
## 6 0.424 0.576 58 Male Male Preprocessor1_Model1
## 7 0.438 0.562 63 Male Male Preprocessor1_Model1
## 8 0.487 0.513 64 Male Male Preprocessor1_Model1
## 9 0.605 0.395 90 Female Male Preprocessor1_Model1
##
## [[9]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.463 0.537 7 Male Female Preprocessor1_Model1
## 2 0.441 0.559 13 Male Female Preprocessor1_Model1
## 3 0.487 0.513 22 Male Female Preprocessor1_Model1
## 4 0.563 0.437 25 Female Female Preprocessor1_Model1
## 5 0.495 0.505 61 Male Male Preprocessor1_Model1
## 6 0.446 0.554 67 Male Male Preprocessor1_Model1
## 7 0.440 0.560 70 Male Male Preprocessor1_Model1
## 8 0.437 0.563 75 Male Male Preprocessor1_Model1
## 9 0.430 0.570 84 Male Male Preprocessor1_Model1
##
## [[10]]
## # A tibble: 9 × 6
## .pred_Female .pred_Male .row .pred_class Gender .config
## <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 0.425 0.575 9 Male Female Preprocessor1_Model1
## 2 0.456 0.544 16 Male Female Preprocessor1_Model1
## 3 0.398 0.602 34 Male Female Preprocessor1_Model1
## 4 0.480 0.520 37 Male Female Preprocessor1_Model1
## 5 0.432 0.568 46 Male Male Preprocessor1_Model1
## 6 0.446 0.554 60 Male Male Preprocessor1_Model1
## 7 0.508 0.492 81 Female Male Preprocessor1_Model1
## 8 0.611 0.389 82 Female Male Preprocessor1_Model1
## 9 0.442 0.558 87 Male Male Preprocessor1_Model1
#Tidy coefficient tables for each model.
t1 <- lr_fit_train$fit$fit$fit %>% tidy()
t2 <- step_fit_train$fit$fit$fit %>% tidy()
#NOTE(review): t3/t4/t5 are built here but the save() calls below write
#ridge_coeff / lasso_coeff / elnet_coeff instead — confirm which objects
#should actually be saved as tables 3-5.
t3 <- ridge_mod %>% tidy() %>% dplyr::select(term, estimate)
t4 <- lasso_mod %>% tidy() %>% dplyr::select(term, estimate)
t5 <- elnet_mod %>% tidy() %>% dplyr::select(term, estimate)
#Keep rows 1, 7 and 8 of the test results (roc_auc, specificity, sensitivity).
t6 <- results_table_test
t6 <- t6[-(2:6), ]
rownames(t6) <- NULL
save(t1, file = "table1.RData")
save(t2, file = "table2.RData")
save(ridge_coeff, file = "table3.RData")
save(lasso_coeff, file = "table4.RData")
save(elnet_coeff, file = "table5.RData")
save(t6, file = "table6.RData")
#Cross-validated lasso for the lambda-vs-AUC plot.
#Fixed: the lambda-grid argument was misspelled `nlamda` and therefore
#silently ignored; it is now spelled `nlambda`.
lambda_plot <- glmnet::cv.glmnet(train_x, train_y, family = "binomial", alpha = 1, type.measure = "auc", nfolds = 9, nlambda = 100000)
autoplot(lambda_plot, colour = 'red')
plot(lambda_plot)
######################################################
# Multicollinearity test
######################################################
#Variance inflation factors for the full logistic regression.
#enframe() replaces the deprecated tidy() method for named numeric vectors
#(the deprecation warning itself pointed away from tidy.numeric).
vif_table <- vif(lr_fit_train$fit$fit$fit) %>%
  enframe(name = "Variable", value = "VIF")
save(vif_table, file = "table_appendix1.RData")
#VIFs for the stepwise model (printed, not saved).
vif(step_fit_train$fit$fit$fit) %>%
  enframe(name = "Variable", value = "VIF")
######################################################
# Linearity test
######################################################
# Component + residual plots to check linearity of the two continuous
# predictors (Age, BMI) on the logit scale.
crPlots(glm(Gender ~ Age, data = final_data, family = binomial),
        smooth = list(span = 1))
crPlots(glm(Gender ~ BMI, data = final_data, family = binomial),
        smooth = list(span = 1))
#Confusion matrix for the elastic net on the test set.
cnf_test <- confusion.glmnet(elnet_mod, newx = test_x, newy = test_y)
cnf_test
## True
## Predicted Female Male Total
## Female 16 3 19
## Male 2 20 22
## Total 18 23 41
##
## Percent Correct: 0.878
#as_tibble() replaces the deprecated tidy() method for tables (as the
#deprecation warning itself recommended).
cnf_test %>%
  as_tibble() %>%
  plot_confusion_matrix(
    target_col = "True",
    prediction_col = "Predicted",
    counts_col = "n")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.
#Confusion matrices for the LR and stepwise models, prepared externally.
cnfmat_lr <- read_excel(here("data", "conf_mat_lr.xlsx"))
cnfmat_step <- read_excel(here("data", "conf_mat_lstep.xlsx"))
lr_results_test
confusion_matrix(lr_results_test$Gender, lr_results_test$.pred_class)
cmat <- conf_mat(lr_results_test, truth = "Gender", estimate = ".pred_class")
#as_tibble() replaces the deprecated tidy() method for tables.
cnf_test %>% as_tibble()
#Plot the externally prepared confusion matrices.
cnfmat_lr %>% plot_confusion_matrix(
  target_col = "True",
  prediction_col = "Predicted",
  counts_col = "n")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.
cnfmat_step %>% plot_confusion_matrix(
  target_col = "True",
  prediction_col = "Predicted",
  counts_col = "n")
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'ggimage' is missing. Will not plot arrows and zero-shading.
## Warning in plot_confusion_matrix(., target_col = "True", prediction_col =
## "Predicted", : 'rsvg' is missing. Will not plot arrows and zero-shading.
#Standard glm diagnostic plots (residuals, QQ, leverage, Cook's distance) for
#the full model; observations 18 and 19 have leverage one and are not plotted
#(see warnings below).
autoplot(glm(formula = Gender~ ., family = "binomial", data = data_train), which = 1:6, label.size = 3)
## Warning: not plotting observations with leverage one:
## 18, 19
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_segment()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).
## Warning: Removed 2 rows containing missing values (`geom_line()`).
#Generalized variance inflation factors for the full model.
#NOTE(review): gvif() is not a base/car export — confirm which package or
#helper provides it; car::vif() below yields the same values at 1 df.
gvif_ <- gvif(glm(formula = Gender~ ., family = "binomial", data = data_train))
## GVIF df GVIF^(1/(2*df))
## Age 1.4694 1 1.2122
## BMI 1.3675 1 1.1694
## CM_DM 1.4288 1 1.1953
## CM_HTM 1.5279 1 1.2361
## CM_IHD 1.4757 1 1.2148
## Lung_Disease 1.1400 1 1.0677
## Symptom_Fever 1.3112 1 1.1451
## Symptom_Cough 1.4049 1 1.1853
## Symptom_Sputum 1.3968 1 1.1819
## Symptom_Runny_Nose 1.1762 1 1.0845
## Symptom_Breathlessness 1.4578 1 1.2074
## Symptom_Abdominal_Pain 1.5726 1 1.2540
## Symptom_Nausea_Vomiting 1.5609 1 1.2494
## Symptom_Diarrhoea 1.2240 1 1.1063
## Symptom_Anosmia_Hyposmia 1.2525 1 1.1192
## Symptom_Red_Eye 1.0000 1 1.0000
## Symptom_Skin_Rash 1.0000 1 1.0000
## Outcome 1.2259 1 1.1072
#Persist the GVIF table for the appendix.
t10 <- (gvif_)
save(t10, file = "gvif.RData")
#Cross-check with car::vif(); because every term has 1 df here, these VIFs
#equal the GVIF values printed above.
car::vif(glm(formula = Gender~ ., family = "binomial", data = data_train))
## Age BMI CM_DM
## 1.469415 1.367541 1.428762
## CM_HTM CM_IHD Lung_Disease
## 1.527945 1.475719 1.139998
## Symptom_Fever Symptom_Cough Symptom_Sputum
## 1.311183 1.404891 1.396843
## Symptom_Runny_Nose Symptom_Breathlessness Symptom_Abdominal_Pain
## 1.176179 1.457801 1.572631
## Symptom_Nausea_Vomiting Symptom_Diarrhoea Symptom_Anosmia_Hyposmia
## 1.560862 1.224015 1.252535
## Symptom_Red_Eye Symptom_Skin_Rash Outcome
## 1.000000 1.000000 1.225894